﻿using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO;
using Stemming;

namespace Preprocess
{
    /// <summary>
    /// Console tool that tokenises and stems a fixed set of text documents, then
    /// writes a shared vocabulary, one index file per document, and a sparse
    /// term-count file ("ldaData.txt") covering all documents.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Reads every input document, builds the vocabulary of stemmed terms and
        /// writes all output files. Input and output paths are hard-coded.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            string[] files = { 
                "..\\..\\..\\..\\data\\cs.txt", 
                "..\\..\\..\\..\\data\\computer.txt",
                "..\\..\\..\\..\\data\\infotheory.txt", 
                "..\\..\\..\\..\\data\\InfoRet.txt",
                "..\\..\\..\\..\\data\\datamining.txt", 
                "..\\..\\..\\..\\data\\cryptography.txt", 
                "..\\..\\..\\..\\data\\cryptanalysis.txt",
                "..\\..\\..\\..\\data\\historycrypto.txt",
                "..\\..\\..\\..\\data\\database.txt"
            };

            // outFiles[0] is the vocabulary file; outFiles[k + 1] is the index
            // file that belongs to files[k].
            string[] outFiles = { 
                "..\\..\\..\\..\\data\\vocab.txt", 
                "..\\..\\..\\..\\data\\cs.index.txt",
                "..\\..\\..\\..\\data\\computer.index.txt",
                "..\\..\\..\\..\\data\\infotheory.index.txt", 
                "..\\..\\..\\..\\data\\InfoRet.index.txt",
                "..\\..\\..\\..\\data\\datamining.index.txt", 
                "..\\..\\..\\..\\data\\cryptography.index.txt", 
                "..\\..\\..\\..\\data\\cryptanalysis.index.txt",
                "..\\..\\..\\..\\data\\historycrypto.index.txt",
                "..\\..\\..\\..\\data\\database.index.txt"
            };

            HashSet<string> vocabulary = new HashSet<string>();
            string[][] docwords = new string[files.Length][];

            // Pass 1: read, tokenise and stem every document, collecting the
            // set of distinct stemmed terms along the way.
            for (int j = 0; j < files.Length; j++)
            {
                // NOTE(review): this instance is never used directly. It is kept
                // because the constructor may have side effects that are not
                // visible from this file - confirm before removing.
                StopWordsHandler swh = new StopWordsHandler();

                string doc;
                // 'using' guarantees the reader is released even if reading throws.
                using (TextReader tr = new StreamReader(files[j]))
                {
                    doc = tr.ReadToEnd();
                }

                Tokeniser tk = new Tokeniser();
                docwords[j] = tk.Partition(doc);

                PorterStemmer stem = new PorterStemmer();
                for (int i = 0; i < docwords[j].Length; i++)
                {
                    // Tokens are replaced in place by their stemmed form.
                    docwords[j][i] = stem.stemTerm(docwords[j][i]);
                    vocabulary.Add(docwords[j][i]);
                }
            }

            // Pass 2a: assign a 1-based id to every vocabulary term and write the
            // vocabulary in both formats ("id & term" and one bare term per line).
            Dictionary<string, int> dict = new Dictionary<string, int>();
            int index = 1;
            using (TextWriter dictFile = new StreamWriter(outFiles[0]))
            using (StreamWriter ldaDictFile = new StreamWriter("ldaVocab.txt"))
            {
                foreach (string word in vocabulary)
                {
                    dict[word] = index;
                    dictFile.WriteLine("{0} & {1}", index, word);
                    ldaDictFile.WriteLine(word);
                    index++;
                }
            }

            // Pass 2b: write the per-document index files and the sparse
            // term-count representation of every document.
            using (StreamWriter ldaData = new StreamWriter("ldaData.txt"))
            {
                for (int d = 0; d < docwords.Length; d++)
                {
                    string[] words = docwords[d];

                    // Term id -> number of occurrences within this document.
                    Dictionary<int, int> sparse = new Dictionary<int, int>();

                    using (TextWriter tw = new StreamWriter(outFiles[d + 1]))
                    {
                        for (int j = 0; j < words.Length; j++)
                        {
                            // Look the id up once instead of once per use.
                            int termId = dict[words[j]];
                            tw.WriteLine("{0} & {1}", termId, words[j]);

                            // TryGetValue leaves 'count' at 0 for an unseen term,
                            // so a single lookup covers both the first and the
                            // repeated occurrence.
                            int count;
                            sparse.TryGetValue(termId, out count);
                            sparse[termId] = count + 1;
                        }
                    }

                    // One record per document: the number of distinct terms,
                    // followed by "id:count" pairs. Ids are shifted to be 0-based
                    // here, whereas the index and vocabulary files use 1-based ids.
                    ldaData.Write(sparse.Count);
                    foreach (KeyValuePair<int, int> kvp in sparse)
                    {
                        ldaData.Write(" {0}:{1}", kvp.Key - 1, kvp.Value);
                    }
                    ldaData.WriteLine();
                    // The original output has an extra blank line after each
                    // record; preserved so the file format does not change.
                    ldaData.WriteLine();
                }
            }
        }
    }
}
